library(h5)
library(MASS)
library(dplyr)
library(ggplot2)
library(plotly)
library(Rtsne)
library(pheatmap)
library(data.table)
library(knitr)
library(RColorBrewer)
source("../r_functions/PublicationTheme_ggplot.R")
source("../r_functions/feature_class.R")
source("../r_functions/Color_Palette.R")
knitr::opts_chunk$set(cache = TRUE, warning = FALSE,
message = FALSE, cache.lazy = FALSE)
htmldir = "/collab-ag-fischer/PROMISE/data-10x-4t-c-16z/htmldir"
species = "mouse"
feature_type = "organoids"
colorScale = get_color_palette(species)
I reduce the features and select the most informative features using a PCA, i.e. I retain the features that contribute the most to the variance as measured by their relative contribution to each PC component.
dmso_dir = sprintf("%s/DMSO/results", species)
f = h5file(file.path(dmso_dir, sprintf("ReducedFeatures_%s_DMSO.h5", species)), "r")
features = data.frame(t(f[sprintf("features_%s", feature_type)][]))
colnames(features) = f[sprintf("feature_names_%s", feature_type)][]
metadata = data.frame(f[sprintf("metadata_%s", feature_type)][], stringsAsFactors = FALSE)
colnames(metadata) = f[sprintf("metadata_names_%s", feature_type)][]
h5close(f)
The features are sampled to ensure a balance between classes.
sample_size = min(table(metadata$Line))
features_organoids = features %>%
group_by("Line" = metadata$Line) %>%
sample_n(sample_size) %>% as.data.frame
I want to test several feature set sizes to compare them visually
expl_var_plots = list()
lda_proj_plots = list()
num_features_list = c(5, 10, 25, 50, 100, 200)
num_features_list = num_features_list[num_features_list < ncol(features)]
num_features_list = c(num_features_list, ncol(features))
for(num_features in num_features_list) {
model = lda(
x=features_organoids %>% select(-Line) %>%
select(seq_len(num_features)),
grouping=features_organoids$Line)
plda = predict(object = model, newdata = features_organoids %>% select(-Line) %>%
select(seq_len(num_features)))
transformed = as.data.frame(plda$x)
transformed$Line = features_organoids$Line
# Accuracy
acc = mean(as.character(plda$class) == features_organoids$Line)
print(sprintf("Accuracy for %s features = %s", num_features, acc))
# Explained Variance
expl_var_plots[[length(expl_var_plots) + 1]] = ggplot_df = data.frame(
"Component" = paste0("LD", seq_along(model$svd)),
"Value" = model$svd**2 / sum(model$svd**2),
"NumFeatures" = num_features)
# ggplot(data = ggplot_df) +
# geom_col(aes(x = Component, y = Value)) +
# xlab("") + ylab("") + ggtitle("Proportion of Variance") +
# theme_Publication() + scale_fill_Publication() +
# theme(legend.position = "none")
# 2D LDA Projection
transformed$NumFeatures = num_features
lda_proj_plots[[length(lda_proj_plots) + 1]] = transformed[, c("LD1", "LD2", "Line", "NumFeatures")]
# ggplot(data = transformed) +
# geom_point(aes(x = LD1, y = LD2, color = Line)) +
# theme_Publication() + scale_fill_Publication() +
# theme(legend.position = "right", legend.direction = "vertical",
# legend.key.size = unit(0.5, "cm"))
}
## [1] "Accuracy for 5 features = 0.637280230259736"
## [1] "Accuracy for 10 features = 0.731456115407289"
## [1] "Accuracy for 25 features = 0.83108506432708"
## [1] "Accuracy for 50 features = 0.879408398931928"
## [1] "Accuracy for 100 features = 0.912109442729826"
## [1] "Accuracy for 157 features = 0.922304678017824"
lda_proj_plots = do.call(rbind, lda_proj_plots)
expl_var_plots = do.call(rbind, expl_var_plots)
ggplot(data = lda_proj_plots) +
geom_point(aes(x = LD1, y = LD2, color = Line)) +
facet_wrap(facets = ~ NumFeatures) +
theme_Publication() + scale_color_manual(values = colorScale) +
theme(legend.position = "right", legend.direction = "vertical",
legend.key.size = unit(0.5, "cm"))
A better separation becomes visible if features are averaged across wells
well_id = paste0(metadata$Plate, "_", metadata$Well)
features_wells = aggregate(x = features, by = list(well_id), FUN = median)
features_wells$Line = substr(features_wells$Group.1, 1, 7)
features_wells$Group.1 = NULL
expl_var_plots = list()
lda_proj_plots = list()
contribs = list()
num_features_list = c(5, 10, 25, 50, 100, 200)
num_features_list = num_features_list[num_features_list < ncol(features)]
num_features_list = c(num_features_list, ncol(features))
for(num_features in num_features_list) {
model = lda(
x=features_wells %>% select(-Line) %>%
select(seq_len(num_features)),
grouping=features_wells$Line)
plda = predict(object = model, newdata = features_wells %>% select(-Line) %>%
select(seq_len(num_features)))
transformed = as.data.frame(plda$x)
transformed$Line = features_wells$Line
# Accuracy
acc = mean(as.character(plda$class) == features_wells$Line)
print(sprintf("Accuracy for %s features = %s", num_features, acc))
# Explained Variance
expl_var_plots[[length(expl_var_plots) + 1]] = data.frame(
"Component" = paste0("LD", seq_along(model$svd)),
"Value" = model$svd**2 / sum(model$svd**2),
"NumFeatures" = num_features)
# ggplot(data = ggplot_df) +
# geom_col(aes(x = Component, y = Value)) +
# xlab("") + ylab("") + ggtitle("Proportion of Variance") +
# theme_Publication() + scale_fill_Publication() +
# theme(legend.position = "none")
# 2D LDA Projection
transformed$NumFeatures = num_features
lda_proj_plots[[length(lda_proj_plots) + 1]] = transformed[
, c("LD1", "LD2", "LD3", "Line", "NumFeatures")]
# ggplot(data = transformed) +
# geom_point(aes(x = LD1, y = LD2, color = Line)) +
# theme_Publication() + scale_fill_Publication() +
# theme(legend.position = "right", legend.direction = "vertical",
# legend.key.size = unit(0.5, "cm"))
# Feature contributions to LD1 and LD2
contrib = list()
for(ldc in colnames(model$scaling)) {
contrib[[ldc]] = model$scaling[,ldc]
contrib[[ldc]] = contrib[[ldc]][order(abs(contrib[[ldc]]), decreasing = TRUE)]
contrib[[ldc]] = data.frame(
"var" = names(contrib[[ldc]]),
"value" = contrib[[ldc]],
"LD" = ldc,
"NumFeatures" = num_features)
}
contribs[[length(contribs) + 1]] = do.call(rbind, contrib)
}
## [1] "Accuracy for 5 features = 0.919755185311119"
## [1] "Accuracy for 10 features = 0.975518531111867"
## [1] "Accuracy for 25 features = 0.994219653179191"
## [1] "Accuracy for 50 features = 0.998979938796328"
## [1] "Accuracy for 100 features = 0.998979938796328"
## [1] "Accuracy for 157 features = 0.999659979598776"
lda_proj_plots = do.call(rbind, lda_proj_plots)
expl_var_plots = do.call(rbind, expl_var_plots)
contribs = do.call(rbind, contribs)
ggplot(data = lda_proj_plots) +
geom_point(aes(x = LD1, y = LD2, color = Line)) +
facet_wrap(facets = ~ NumFeatures) +
theme_Publication() + scale_color_manual(values = colorScale) +
theme(legend.position = "right", legend.direction = "vertical",
legend.key.size = unit(0.5, "cm"))
ggplot(data = lda_proj_plots) +
geom_point(aes(x = LD2, y = LD3, color = Line)) +
facet_wrap(facets = ~ NumFeatures) +
theme_Publication() + scale_color_manual(values = colorScale) +
theme(legend.position = "right", legend.direction = "vertical",
legend.key.size = unit(0.5, "cm"))
expl_var_plots = list()
pca_plots = list()
contribs = list()
num_features_list = c(5, 10, 25, 50, 100, 200)
num_features_list = num_features_list[num_features_list < ncol(features)]
num_features_list = c(num_features_list, ncol(features))
for(num_features in num_features_list) {
model = prcomp(
x = features_wells %>% select(-Line) %>%
select(seq_len(num_features)),
scale. = TRUE, center = TRUE)
transformed = as.data.frame(model$x)
transformed$Line = features_wells$Line
# Explained Variance
expl_var_plots[[length(expl_var_plots) + 1]] = data.frame(
"Component" = paste0("PC", seq_along(model$sdev)),
"Value" = model$sdev / sum(model$sdev),
"NumFeatures" = num_features)
# 2D LDA Projection
transformed$NumFeatures = num_features
pca_plots[[length(pca_plots) + 1]] = transformed[, c("PC1", "PC2", "Line", "NumFeatures")]
}
pca_plots = do.call(rbind, pca_plots)
expl_var_plots = do.call(rbind, expl_var_plots)
ggplot(data = pca_plots) +
geom_point(aes(x = PC1, y = PC2, color = Line)) +
facet_wrap(facets = ~ NumFeatures) +
theme_Publication() + scale_color_manual(values = colorScale) +
theme(legend.position = "right", legend.direction = "vertical",
legend.key.size = unit(0.5, "cm"))
tsne = Rtsne(features_wells %>% select(-Line), perplexity = 50)
ggplot_df = data.frame(
"Line" = features_wells$Line,
"X1" = tsne$Y[,1],
"X2" = tsne$Y[,2])
ggplot(data = ggplot_df) + geom_point(aes(x = X1, y = X2, color = Line)) +
theme_Publication() + xlab("X") + ylab("Y") +
theme(legend.position = "right", legend.direction = "vertical",
legend.key.size = unit(0.5, "cm")) +
scale_color_manual(values = colorScale)
I perform a clustering on the lines
# model = lda(
# x=features_wells %>% select(-Line),
# grouping=features_wells$Line)
# plda = predict(
# object = model,
# newdata = features_wells %>% select(-Line))
# transformed = as.data.frame(plda$x)
#
# # Explained Variance
# expl_var_plots = data.frame(
# "Component" = paste0("LD", seq_along(model$svd)),
# "Value" = model$svd**2 / sum(model$svd**2),
# "NumFeatures" = num_features)
model = prcomp(
x = features_wells %>% select(-Line) %>%
select(seq_len(num_features)),
scale. = TRUE, center = TRUE)
transformed = as.data.frame(model$x)
transformed$Line = features_wells$Line
features_lines = aggregate(
x = transformed %>% select(-Line),
by = list(features_wells$Line),
FUN = mean)
rownames(features_lines) = features_lines$Group.1
features_lines$Group.1 = NULL
d = dist(features_lines)
hc = hclust(d, method = "complete")
pheatmap(
mat = t(as.matrix(features_lines[, 1:5])), cluster_rows = FALSE,
cluster_cols = hc)
set.seed(100)
num_imgs = 3
metadata_wells = aggregate(x = metadata, by = list(well_id), FUN = first)
random_wells = metadata_wells %>% dplyr::group_by(Line) %>% sample_n(num_imgs)
all_imgs = list()
captions = c()
for(line in sort(unique(random_wells$Line))) {
subdat = random_wells[random_wells$Line == line, ]
imgs = list()
plates = c()
for(ii in seq_len(num_imgs)) {
plate = subdat[ii, "Plate"]
plates = c(plates, as.character(plate))
row = substr(subdat[ii, "Well"], 1, 1)
col = substr(subdat[ii, "Well"], 2, 3)
img_url = file.path(htmldir, plate, sprintf("%s_%s_%s_1.jpeg", plate, row, col))
imgs[[ii]] = EBImage::readImage(img_url)
}
all_imgs[[length(all_imgs)+1]] = EBImage::abind(imgs, along = 1)
captions = c(captions, paste0(plates, collapse = ", "))
}
EBImage::display(all_imgs[[1]])
Figure 1: M001A03P006L05, M001A03P005L04, M001A03P009L02
EBImage::display(all_imgs[[2]])
Figure 2: M001B04P003L02, M001B04P008L07, M001B04P008L07
EBImage::display(all_imgs[[3]])
Figure 3: M001K02P012L05, M001K02P007L06, M001K02P009L02
EBImage::display(all_imgs[[4]])
Figure 4: M001W01P004L03, M001W01P010L03, M001W01P014L07
The features are sorted by importance, i.e. how well they distinguish organoid lines. I look at the first few features to see how lines cluster along them.
ggplot_df = melt(features_wells %>% select(1:12, Line), id.vars = "Line")
ggplot(data = ggplot_df) +
geom_histogram(mapping = aes(value, fill = Line),
position = "identity", alpha = 0.5, bins = 50) +
facet_wrap(facets = ~variable, nrow = 4) +
theme_Publication() + scale_fill_manual(values = colorScale) +
labs(title = "Histogram of Organoid Area", x = "", y = "")
features_lines = aggregate(features, list(metadata$Line), median)
# Reshape into a ggplot-compatible format
features_lines = melt(features_lines, id.vars = "Group.1")
# Rename features into more readable names
rename_dict = c(
"x.0.s.area" = "Area", "x.a.b.q05" = "Actin Intensity", "x.b.b.q05" = "FITC Intensity",
"x.c.b.q05" = "DAPI Intensity", "x.a.h.idm.s4" = "Actin Haralick IDM",
"x.Ba.h.sen.s8" = "Actin Haralick SEN")
features_lines = features_lines[features_lines$variable %in% names(rename_dict), ]
features_lines$variable = rename_dict[as.character(features_lines$variable)]
ggplot(data = features_lines) + geom_col(aes(x = variable, y = value, fill = variable)) +
facet_wrap(facets = ~Group.1) + theme_Publication() + xlab("") + ylab("") +
theme(legend.position = "right", legend.direction = "vertical",
axis.text.x = element_blank(), legend.key.size = unit(0.5, "cm")) +
scale_fill_Publication()